package cz.semjobKB.extract.api.impl;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import cz.semjobKB.extract.api.IExtractText;

/**
 * Text extractor for Microsoft Word documents, built on Apache POI's
 * {@link HWPFDocument} and {@link WordExtractor}.
 */
@Component
public class ExtractFromDoc implements IExtractText {

	private static final Logger logger = Logger.getLogger(ExtractFromDoc.class);

	/**
	 * Matches the control characters U+0000 through U+0015 that are stripped
	 * from every paragraph. Compiled once instead of once per paragraph.
	 */
	private static final Pattern CONTROL_CHARS = Pattern.compile("[\u0000-\u0015]");

	@Autowired
	private PostProcessing postProcessing;

	/**
	 * Reads the given Word document and returns its text.
	 *
	 * <p>Each paragraph reported by the extractor has the control characters
	 * U+0000..U+0015 removed and is terminated with a single {@code "\n"}.
	 * The concatenated text is then passed through
	 * {@code postProcessing.removeEndings} before being returned.
	 *
	 * @param file the Word document to read
	 * @return the cleaned, post-processed text of the document
	 * @throws IOException if the file cannot be opened or parsed
	 */
	public String extractText(File file) throws IOException {

		logger.info("Extracting text from Microsoft Word document: " + file.getAbsolutePath());

		StringBuilder text = new StringBuilder();

		InputStream streamIn = new FileInputStream(file);
		try {
			HWPFDocument document = new HWPFDocument(streamIn);
			WordExtractor extractor = new WordExtractor(document);
			for (String paragraph : extractor.getParagraphText()) {
				text.append(CONTROL_CHARS.matcher(paragraph).replaceAll("")).append("\n");
			}
		} finally {
			// Always release the file handle, including when POI fails to parse
			// the document; previously the stream was never closed.
			streamIn.close();
		}

		String output = postProcessing.removeEndings(text.toString());

		logger.info("Text from Microsoft Word has been extracted.");

		return output;

	}

}
